Load libraries and files

library(here)
## here() starts at /Users/zohrazahir/Documents/GitHub/Species_distributions_and_traits
source(here("functions/R_package_check.R"))
## Loading required package: tidyverse
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'tidyverse'
## also installing the dependencies 'purrr', 'gargle', 'ids', 'selectr', 'dbplyr', 'dtplyr', 'googledrive', 'googlesheets4', 'haven', 'modelr', 'reprex', 'rvest', 'covr', 'feather'
## 
## The downloaded binary packages are in
##  /var/folders/x0/cz4tb35n5fbb5y4zh6srtrt00000gn/T//RtmpY52V84/downloaded_packages
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: vegan
## 
## Loading required package: permute
## 
## Loading required package: lattice
## 
## This is vegan 2.6-4
## 
## Loading required package: lubridate
## 
## 
## Attaching package: 'lubridate'
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## 
## Loading required package: lmodel2
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'lmodel2'
## 
## The downloaded binary packages are in
##  /var/folders/x0/cz4tb35n5fbb5y4zh6srtrt00000gn/T//RtmpY52V84/downloaded_packages
## Loading required package: openxlsx
## Loading required package: knitr
## Loading required package: PBSmapping
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'PBSmapping'
## 
## The downloaded binary packages are in
##  /var/folders/x0/cz4tb35n5fbb5y4zh6srtrt00000gn/T//RtmpY52V84/downloaded_packages
## 
## -----------------------------------------------------------
## PBS Mapping 2.73.2 -- Copyright (C) 2003-2023 Fisheries and Oceans Canada
## 
## PBS Mapping comes with ABSOLUTELY NO WARRANTY;
## for details see the file COPYING.
## This is free software, and you are welcome to redistribute
## it under certain conditions, as outlined in the above file.
## 
## A complete user guide 'PBSmapping-UG.pdf' is located at 
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/PBSmapping/doc/PBSmapping-UG.pdf
## 
## Packaged on 2022-09-06
## Pacific Biological Station, Nanaimo
## 
## All available PBS packages can be found at
## https://github.com/pbs-software
## 
## To see demos, type '.PBSfigs()'.
## -----------------------------------------------------------
## 
## 
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## 
## Loading required package: viridis
## Loading required package: viridisLite
## Loading required package: ggbreak
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'ggbreak'
## also installing the dependencies 'magick', 'ggimage', 'prettydoc'
## 
## The downloaded binary packages are in
##  /var/folders/x0/cz4tb35n5fbb5y4zh6srtrt00000gn/T//RtmpY52V84/downloaded_packages
## ggbreak v0.1.1
## 
## If you use ggbreak in published research, please cite the following
## paper:
## 
## S Xu, M Chen, T Feng, L Zhan, L Zhou, G Yu. Use ggbreak to effectively
## utilize plotting space to deal with large datasets and outliers.
## Frontiers in Genetics. 2021, 12:774846. doi: 10.3389/fgene.2021.774846
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Loading required package: Polychrome
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Polychrome'
## also installing the dependency 'scatterplot3d'
## 
## The downloaded binary packages are in
##  /var/folders/x0/cz4tb35n5fbb5y4zh6srtrt00000gn/T//RtmpY52V84/downloaded_packages
prepareLibrary()
source(here("functions/QOL_toolkit.R"))

theme_set(theme_bw())
set_here(path = "..")
## File .here already exists in /Users/zohrazahir/Documents/GitHub/Species_distributions_and_traits
# Load trait dataset metadata tables
directory.trait <- openxlsx::read.xlsx(here("tables/trait_directory_20230207.xlsx"))
directory.lifestage <- openxlsx::read.xlsx(here("tables/lifestage_directory_20230207.xlsx"))
taxonomy <- openxlsx::read.xlsx(here("tables/taxonomy_table_20230207.xlsx"))
dataset.format <- openxlsx::read.xlsx(here("tables/trait_dataset_standard_format_20230118.xlsx"))[-1,]

# Load the trait dataset
load(here("data/zooplankton_traits-2023-02-12.RData"))

Contents of the trait dataset

The trait dataset is provided in two levels. Level 1 (traits.lvl1) contains individual-level records which can have multiple measurements of a trait for a species. Level 2 (traits.lvl2) contains species-level averages for numerical traits. Only taxa at the species or subspecies ranks are provided in the level 2 data. For the exploration below, we will look into the level 2 data.

# Assign the trait data level to analyze
zooplankton.traits <- traits.lvl2 

# Structure of the dataset
# The dataset starts with identifiers (cols 1-5), followed by the core trait information of the scientific name, trait name, trait value, trait unit, and data type. Many traits have ancillary information (cols 11-26) such as life stage, temperature, and notes. This are followed by the references (cols 27-30) and the taxonomic information (31-41) associated with the trait record. All taxa have a taxonID and most have an aphiaID to link it with the WoRMS database.Details regarding the data provenance of the original trait record are retained in columns 42-54. 
colnames(zooplankton.traits)
##  [1] "traitTaxonID"               "traitID"                   
##  [3] "taxonID"                    "stageID"                   
##  [5] "taxonRankID"                "scientificName"            
##  [7] "traitName"                  "traitValue"                
##  [9] "traitUnit"                  "dataType"                  
## [11] "lifeStage"                  "traitValueTemperature"     
## [13] "traitValueSD"               "traitValueN"               
## [15] "sizeType"                   "sizeAssocName"             
## [17] "sizeAssocUnit"              "sizeAssocValue"            
## [19] "sizeAssocSD"                "sizeAssocN"                
## [21] "sizeAssocReference"         "location"                  
## [23] "longitude"                  "latitude"                  
## [25] "notes"                      "observationNumber"         
## [27] "primaryReference"           "secondaryReference"        
## [29] "primaryReferenceDOI"        "secondaryReferenceDOI"     
## [31] "aphiaID"                    "aphiaAuthority"            
## [33] "rank"                       "kingdom"                   
## [35] "phylum"                     "class"                     
## [37] "order"                      "family"                    
## [39] "genus"                      "species"                   
## [41] "majorgroup"                 "traitValueSource"          
## [43] "traitValueMeasurementNotes" "uploadBy"                  
## [45] "uploadDate"                 "maxObsNum"                 
## [47] "verbatimScientificName"     "verbatimTraitName"         
## [49] "verbatimTraitValue"         "verbatimTraitUnit"         
## [51] "verbatimLifeStage"          "verbatimNotes"             
## [53] "isMerged"                   "mergedTraitTaxonIDs"
# What traits are available?
# The current dataset has 36 traits distributed to 81 traitNames and assigned to 5 general trait buckets. Traits have a dataset-specific traitName which were assigned with standardized trait units. A single trait may be represented in multiple ways and thus have unique traitIDs and traitNames (e.g., the nitrogen content trait can be "nitrogenTotal" which is the bulk total nitrogen of an individual in mg or "nitrogenPDW" which is the percent of nitrogen relative to the dry weight). 
length(unique(directory.trait$traitName))
## [1] 83
length(unique(directory.trait$trait))
## [1] 36
unique(directory.trait$traitBucket)
## [1] "morphological" "composition"   "behavioral"    "physiological"
## [5] "life history"
unique(directory.trait$dataType)
## [1] "continuous"  "categorical" "binary"
# Table of traits
kable(select(directory.trait, traitBucket, trait, traitName, traitUnit, dataType))
traitBucket trait traitName traitUnit dataType
morphological body length bodyLengthMax mm continuous
morphological dry weight dryWeight mg continuous
morphological wet weight wetWeight mg continuous
morphological ash free dry weight afdwPDW percent continuous
morphological carbon weight carbonWeight mg continuous
morphological water content waterPWW percent continuous
morphological water content dryWeightPWW percent continuous
morphological myelination myelination NA categorical
morphological bioluminescence bioluminescence NA categorical
composition carbon content carbonPDW percent continuous
composition carbon content carbonPWW percent continuous
composition nitrogen content nitrogenTotal mg continuous
composition nitrogen content nitrogenPDW percent continuous
composition phosphorus content phosphorusTotal mg continuous
composition phosphorus content phosphorusPDW percent continuous
composition elemental ratio ratioCN NA continuous
composition elemental ratio ratioCP NA continuous
composition elemental ratio ratioNP NA continuous
composition ash content ashTotal mg continuous
composition ash content ashPDW percent continuous
composition lipid content lipidPDW percent continuous
composition lipid content lipidPWW percent continuous
composition protein content proteinPDW percent continuous
composition protein content proteinPWW percent continuous
composition carbohydrate content carbohydratePDW percent continuous
composition proximate ratio ratioProteinLipid NA continuous
composition energy content energyContent J continuous
composition energy density energyDensityDW J mg DW^-1 continuous
composition energy density energyDensityWW J mg WW^-1 continuous
composition energy density energyDensityVolume J m^-3 continuous
behavioral clearance rate clearanceRate_15C ml h^-1 continuous
behavioral clearance rate clearanceRate_WSC_15C ml mg C^-1 h^-1 continuous
behavioral ingestion rate ingestionRate_15C ug C h^-1 continuous
behavioral ingestion rate ingestionRate_WSC_15C ug C mg C^-1 h^-1 continuous
physiological respiration rate respirationRate_15C ul O2 h^-1 continuous
physiological respiration rate respirationRate_WSDW_15C ul O2 mg DW^-1 h^-1 continuous
physiological respiration rate respirationRate_WSWW_15C ul O2 mg WW^-1 h^-1 continuous
physiological respiration rate respirationRate_WSC_15C ul O2 mg C^-1 h^-1 continuous
physiological excretion rate N excretionRateN_15C ug N-NH4+ h^-1 continuous
physiological excretion rate N excretionRateN_WSDW_15C ug N-NH4+ mg DW^-1 h^-1 continuous
physiological excretion rate P excretionRateP_15C ug P-PO43- h^-1 continuous
physiological excretion rate P excretionRateP_WSDW_15C ug P-PO43- mg DW^-1 h^-1 continuous
physiological growth rate growthRate_15C mg C h^-1 continuous
physiological growth rate growthRate_WSC_15C mg C mg C^-1 h^-1 continuous
behavioral feeding mode feedingMode NA categorical
behavioral reproduction mode reproductionMode NA categorical
behavioral trophic group trophicGroup NA categorical
behavioral vertical distribution verticalDistribution NA categorical
behavioral habitat association habitatAssociation NA categorical
behavioral diel vertical migration dielVerticalMigration NA categorical
life history hibernation hibernationStage NA categorical
life history clutch size clutchSize NA continuous
life history fecundity fecundity NA continuous
life history egg size eggWeight ug continuous
life history egg size eggDiameter um continuous
life history development duration developmentDuration day continuous
morphological myelination MYE.absent NA binary
morphological myelination MYE.present NA binary
morphological bioluminescence BIO.absent NA binary
morphological bioluminescence BIO.present NA binary
behavioral feeding mode FM.passive NA binary
behavioral feeding mode FM.active NA binary
behavioral feeding mode FM.cruise NA binary
behavioral feeding mode FM.current NA binary
behavioral feeding mode FM.ambush NA binary
behavioral feeding mode FM.active.ambush NA binary
behavioral feeding mode FM.passive.ambush NA binary
behavioral feeding mode FM.particle.feeder NA binary
behavioral feeding mode FM.parasite NA binary
behavioral feeding mode FM.commensal NA binary
behavioral reproduction mode RM.broadcasting NA binary
behavioral reproduction mode RM.brooding NA binary
behavioral reproduction mode RM.asexual NA binary
behavioral trophic group TG.carnivore NA binary
behavioral trophic group TG.omnivore NA binary
behavioral trophic group TG.herbivore NA binary
behavioral trophic group TG.detritivore NA binary
behavioral diel vertical migration DVM.absent NA binary
behavioral diel vertical migration DVM.present NA binary
behavioral diel vertical migration DVM.reverse NA binary
behavioral diel vertical migration DVM.weak NA binary
behavioral diel vertical migration DVM.strong NA binary
life history hibernation HIB.present NA binary
# The trait dataset contains information in 3 data types. "Continuous" are numerical traits while categorical traits are coded as characters. Categorical traits may also be represented as binary records which can be filtered using dataType == "binary". For binary records, 1 represents presence of a trait and 0 represents absence.
unique(directory.trait$dataType)
## [1] "continuous"  "categorical" "binary"
# For now, we can review the coverage of traits without the binary traits.
zooplankton.traits %>%
  filter(dataType == "binary") %>% 
  distinct(traitName)
## # A tibble: 27 × 1
##    traitName        
##    <chr>            
##  1 TG.omnivore      
##  2 TG.herbivore     
##  3 TG.carnivore     
##  4 TG.detritivore   
##  5 FM.passive       
##  6 FM.active.ambush 
##  7 FM.ambush        
##  8 FM.active        
##  9 FM.current       
## 10 FM.passive.ambush
## # … with 17 more rows

Trait coverage

Species-TraitName

This lists the traits and counts the number of species-level records for a specific traitName.

# This is the number of taxon observation per trait. The number of taxa with trait records vary with bodyLengthMax having the most trait data. The coverage of rate trait data is limited because this dataset includes only trait observations with temperature information.
traits.summary <- zooplankton.traits %>% 
  filter(dataType != "binary") %>% 
  # Add information about the trait bucket 
  left_join(distinct(directory.trait, traitBucket, traitID),
            by = "traitID") %>% 
  # Assign trait buckets as factors
  mutate(traitBucket = factor(traitBucket, 
                              levels = c("morphological","composition",
                                         "physiological","behavioral",
                                         "life history"))) %>% 
  # Isolate relevant columns
  distinct(traitBucket, traitID, traitName, taxonID) %>% 
  # Add up the number of taxon records per trait
  group_by(traitBucket, traitName, traitID) %>% 
  summarise(Nspecies = n()) %>% 
  ungroup() %>% 
  # Arrange the traits in decreasing order for the figure
  arrange(-Nspecies) %>% 
  mutate(traitName = fct_reorder(traitName, Nspecies))
## `summarise()` has grouped output by 'traitBucket', 'traitName'. You can
## override using the `.groups` argument.
# Stacked barplot with count at end
trait.bar <- ggplot(traits.summary, aes(x = traitName, y = Nspecies)) +
  geom_bar(stat = "identity") +
  # Set axis break
  scale_y_continuous(breaks = c(0, 250, 500, 750, 1000, 3000),
                     limits = c(0, 3180)) +
  scale_y_break(c(1250,2980), expand = FALSE) +
  # Add number of species at the end of the bar
  geom_text(aes(label = after_stat(y), group = traitName),
            stat = "summary", fun = sum, hjust = -0.1,
            size = 3) +
  coord_flip() +
  xlab("Trait") + ylab("# Species") +
  # facets proportioned by number of traits
  facet_grid(traitBucket ~., scales = "free_y", space = "free", switch = "y") +
  theme(strip.placement = "outside") 
trait.bar

## Species-TraitName by major group We can visualize the taxonomic coverage by major group.

# Assign colors to major groups
majorgroup.colors <- data.frame(
  majorgroup = c("Calanoid","Non-calanoid","Amphipod","Decapod","Euphausiid",
                 "Mysid","Ostracod","Polychaete","Pteropod",
                 "Chaetognath","Appendicularian","Thaliacean","Cladoceran",
                 "Hydromedusae","Siphonophore","Scyphomedusae","Ctenophore"),
  color = c("blue","yellowgreen","cornflowerblue","lightseagreen","forestgreen",
            "darkseagreen2","cyan3","tan1","olivedrab",
            "darkorchid","violetred","hotpink3","saddlebrown",
            "tomato1","goldenrod3","yellow3","red4"))
            
# This is similar to the chunk above but with different groupings
traits.summary <- zooplankton.traits %>% 
  filter(dataType != "binary") %>% 
  # Add information about the trait bucket 
  left_join(distinct(directory.trait, traitBucket, traitID),
            by = "traitID") %>% 
  # Assign trait buckets as factors
  mutate(traitBucket = factor(traitBucket, 
                              levels = c("morphological","composition",
                                         "physiological","behavioral",
                                         "life history"))) %>% 
  # Isolate relevant columns
  distinct(traitBucket, traitID, traitName, majorgroup, taxonID) %>% 
  # Add up the number of taxon records per trait
  group_by(traitBucket, traitName, majorgroup, traitID) %>% 
  # Set the levels of the major groups
  mutate(majorgroup = factor(majorgroup, 
                             levels = majorgroup.colors$majorgroup)) %>% 
  summarise(Nspecies = n()) %>% 
  ungroup() %>% 
  # Arrange the traits in decreasing order for the figure
  group_by(traitBucket, traitName) %>% 
  mutate(Nspecies.total = sum(Nspecies)) %>% 
  ungroup() %>% 
  arrange(-Nspecies.total) %>% 
  mutate(traitName = fct_reorder(traitName, Nspecies.total)) 
## `summarise()` has grouped output by 'traitBucket', 'traitName', 'majorgroup'.
## You can override using the `.groups` argument.
# Stacked barplot with count at end
trait.bar <- ggplot(traits.summary, aes(x = traitName, y = Nspecies,
                                        fill = majorgroup)) +
  geom_bar(stat = "identity") +
  # Set axis break
  scale_y_continuous(breaks = c(0, 250, 500, 750, 1000, 3000),
                     limits = c(0, 3180)) +
  scale_y_break(c(1250,2980), expand = FALSE) +
  # Add number of species at the end of the bar
  geom_text(aes(label = after_stat(y), group = traitName),
            stat = "summary", fun = sum, hjust = -0.1,
            size = 3) +
  scale_fill_manual(values = majorgroup.colors$color,
                    limits = majorgroup.colors$majorgroup,
                    name = "Major group") +
  coord_flip() +
  xlab("Trait") + ylab("# Species") +
  # facets proportioned by number of traits
  facet_grid(traitBucket ~., scales = "free_y", space = "free", switch = "y") +
  theme(strip.placement = "outside") 
trait.bar

Number of traits per species

Here, we inspect the number of traits per species and continue to check how evenly distributed the dataset is. This figure shows that most species have only a few traits assigned to them, but there are a few species with a lot of trait information known.

traits.summary.2 <- zooplankton.traits %>% 
  filter(dataType != "binary") %>% 
  distinct(traitName, scientificName, taxonID) %>% 
  group_by(taxonID, scientificName) %>% 
  summarise(Ntraits = n()) %>% 
  arrange(-Ntraits)
## `summarise()` has grouped output by 'taxonID'. You can override using the
## `.groups` argument.
head(traits.summary.2)
## # A tibble: 6 × 3
## # Groups:   taxonID [6]
##   taxonID scientificName       Ntraits
##     <dbl> <chr>                  <int>
## 1     630 Calanus finmarchicus      47
## 2    3023 Euphausia pacifica        45
## 3    1478 Neocalanus plumchrus      43
## 4    1475 Neocalanus cristatus      42
## 5    3079 Aglantha digitale         42
## 6    4894 Aurelia aurita            41
traits.summary.2 <- traits.summary.2 %>% 
  group_by(Ntraits) %>% 
  summarise(Ntraits.total = sum(Ntraits)) %>% 
  ungroup() %>% 
  arrange(-Ntraits)

ggplot(traits.summary.2, aes(x = Ntraits, y = Ntraits.total)) +
  geom_point(size = 2, colour = "black") +
  geom_segment( aes(x=Ntraits, xend=Ntraits, y=0, yend=Ntraits.total)) +
  xlab("Number of traits per species") +
  ylab("Number of records")

Inspect specific traits

Numerical traits: average and range of trait between major groups

# Numerical traits
# As an example, we can inspect the range of body lengths in each major group.
body.length <- zooplankton.traits %>% 
  filter(traitName == "bodyLengthMax") %>% 
  mutate(traitValue = as.numeric(traitValue)) %>% 
  # calculate the mean to sort major groups
  group_by(majorgroup) %>% 
  mutate(mean.size = mean(traitValue)) %>% 
  ungroup() %>% 
  mutate(majorgroup = fct_reorder(majorgroup, mean.size)) 
  
# In this dataset, body length varies across four orders of magnitude. Many major groups have overlaping and similar body sizes. This suggests that if we approach research questions from, say, a size-spectrum perspective, taxonomic differences between major groups might not be that important.
ggplot(body.length, aes(x = majorgroup, y = traitValue)) +
  geom_violin() +
  # visualize the mean
  stat_summary(fun.data = "mean_cl_boot", geom = "pointrange",
               colour = "red") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") +
  theme(axis.text.x = element_text(angle = 45,  hjust=1))
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Removed 1 rows containing missing values (`geom_segment()`).

# Although, there would be taxonomic nuances in how size is measured. A measurement of total body length for individuals with an elongated body plan is typical but for more spherical and/or colonial organisms, length could be based on a specific body axis or individual in colonial organisms. Below lists the various body length axes in the dataset.
(cleanStrings(unique(body.length$sizeType)))
## [1] "anterior nectophore length; bell height; bell width; body height; body width; nectophore height; nectophore length; nectophore width; not given; posterior nectophore length; prosome length; shell length; total length; trunk length"
# One important trait is nitrogen content. For example, this can be used a currency in biogeochemical models.  When considering composition traits, it is important to distinguish total or percent-weight trait values and choose which would be most appropirate for a particular analysis.
nitrogen.content <- zooplankton.traits %>% 
  # inspect total nitrogen content and the fraction of dry weight which is nitrogen
  filter(traitName %in% c("nitrogenTotal", "nitrogenPDW")) %>% 
  mutate(traitValue = as.numeric(traitValue))
  
ggplot(nitrogen.content, aes(x = majorgroup, y = traitValue)) +
  geom_violin() +
  # visualize the mean
  stat_summary(fun.data = "mean_cl_boot", geom = "pointrange",
               colour = "red") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") +
  theme(axis.text.x = element_text(angle = 45,  hjust=1)) +
  facet_wrap(~traitName, ncol = 1, scales = "free_y")
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 2 rows containing missing values (`geom_segment()`).
## Removed 2 rows containing missing values (`geom_segment()`).

Categorical traits: inspect proportions

In this dataset, categorical traits, particularly behavioral traits, are some of the most common records. Although numerical traits are useful (and much needed) in quantitative models, categorical traits are often used to characterize zooplankton assemblages or even as predictor variables. Here we inspect some of the commonly utilized categorical traits in zooplankton studies: reproduction mode, feeding mode, trophic group, and vertical distribution.

Note that in all these traits, an organism can be associated with multiple trait values (e.g., salps have a sexual broadcasting reproduction strategy and an asexual reproduction strategy). This should be considered when performing analysis using categorical traits. For now we will visualize all instances of a categorical trait.

# Visualize the number of species associated with a particular categorical trait
vertical.distribution <- zooplankton.traits %>% 
  filter(traitName == "verticalDistribution") %>% 
  group_by(traitValue) %>% 
  summarise(Nspecies = n())

ggplot(vertical.distribution, aes(x = traitValue, y = Nspecies)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("Vertical distribution") + ylab("# Species") +
  scale_x_discrete(limits = rev(c("epipelagic","epimesopelagic","epibathypelagic",
                              "mesopelagic","mesobathypelagic","bathypelagic")))

# The trait information could be accessed by calling the categorical version of the trait (e.g., "reproductionMode"), but for some instances, using the binary version would be easier in analysis or visuals.
repro.mode <- zooplankton.traits %>% 
  filter(traitName %in% c("RM.broadcasting","RM.brooding","RM.asexual"),
         traitValue == 1) %>% 
  # rename trait levels by removing "RM." prefix
  mutate(traitName = str_replace(traitName, "RM.","")) 

# Bar plot of total species with a particular trait value, organized by major group.
repro.mode <- repro.mode %>% 
  group_by(traitName, majorgroup) %>% 
  summarise(Nspecies = n()) 
## `summarise()` has grouped output by 'traitName'. You can override using the
## `.groups` argument.
ggplot(repro.mode, aes(x = traitName, y = Nspecies, 
                       fill = majorgroup)) +
   geom_bar(stat = "identity") +
  scale_fill_manual(values = majorgroup.colors$color,
                    limits = majorgroup.colors$majorgroup,
                    name = "Major group") +
  coord_flip() +
  xlab("Reproduction mode") + ylab("# Species") 

# Proportion of reproduction modes per major group
ggplot(repro.mode, aes(x = majorgroup, y = Nspecies, 
                       fill = traitName)) +
  geom_bar(position = "fill", stat = "identity") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent)  + 
  scale_x_discrete(limits = rev) +
  xlab("Major group") + ylab("% Species") 

# Relationship between trophic group and feeding mode

# There are different ways in which the feeding mode trait is coded. 
unique(filter(zooplankton.traits, grepl("FM.",traitName))$traitName)
##  [1] "FM.passive"         "FM.active.ambush"   "FM.ambush"         
##  [4] "FM.active"          "FM.current"         "FM.passive.ambush" 
##  [7] "FM.cruise"          "FM.particle.feeder" "FM.parasite"       
## [10] "FM.commensal"
# For now, we can consider, ambush, cruise, and current feeding and check if there might be a relationship between feeding mode and the general trophic group a species is often clustered into.
feeding.trophic <- zooplankton.traits %>% 
  filter(traitName %in% c("FM.ambush","FM.cruise","FM.current","FM.particle.feeder"),
         traitValue == 1) %>% 
  # rename the trait variable
  rename(feeding.mode = traitName) %>% 
  # rename trait levels by removing "RM." prefix
  mutate(feeding.mode = str_replace(feeding.mode, "FM.","")) %>% 
  # merge with the column on trophic groups
  left_join(select(filter(zooplankton.traits,
                          traitName %in% c("TG.omnivore","TG.herbivore",
                                           "TG.carnivore","TG.detritivore")),
                   taxonID, trophic.group = traitName),
            by = "taxonID") %>% 
  mutate(trophic.group = str_replace(trophic.group, "TG.","")) %>% 
  relocate(feeding.mode, trophic.group) %>% 
  # only consider instances when information for both traits are available
  filter(!is.na(feeding.mode) & !is.na(trophic.group)) %>% 
  group_by(feeding.mode, trophic.group) %>% 
  summarise(Nspecies = n())
## `summarise()` has grouped output by 'feeding.mode'. You can override using the
## `.groups` argument.
ggplot(feeding.trophic, aes(x = trophic.group, y = Nspecies,
                            fill = feeding.mode)) + 
  geom_bar(position = "fill", stat = "identity") +
  scale_y_continuous(labels = scales::percent)  + 
  xlab("Major group") + ylab("% Species") 

Size as a predictor of trait distributions

Size is commonly referred to as a “master trait” because many biological attributes scales with size. There are multiple measures of size such as length or biomass. For zooplankton, this is is an important concern when considering both crustaceans and gelatinous or soft-bodied species. Gelatinous zooplankton have an “inflated” size because they have a higher water content. Note that for the figures below, we are simply using the geom_smooth() function of ggplot to estimate and visualize a linear model. If we to formally derive a regression, care must be taken in choosing the regression model and data curation.

unique(zooplankton.traits$traitName)
##  [1] "carbonWeight"             "dryWeight"               
##  [3] "afdwPDW"                  "wetWeight"               
##  [5] "dryWeightPWW"             "waterPWW"                
##  [7] "carbonPDW"                "nitrogenPDW"             
##  [9] "ratioNP"                  "carbonPWW"               
## [11] "ratioCN"                  "phosphorusPDW"           
## [13] "ratioCP"                  "nitrogenTotal"           
## [15] "phosphorusTotal"          "proteinPDW"              
## [17] "lipidPDW"                 "ashPDW"                  
## [19] "proteinPWW"               "lipidPWW"                
## [21] "ratioProteinLipid"        "carbohydratePDW"         
## [23] "ashTotal"                 "energyDensityWW"         
## [25] "energyContent"            "energyDensityVolume"     
## [27] "energyDensityDW"          "clearanceRate_15C"       
## [29] "clearanceRate_WSC_15C"    "ingestionRate_15C"       
## [31] "ingestionRate_WSC_15C"    "respirationRate_15C"     
## [33] "respirationRate_WSC_15C"  "growthRate_15C"          
## [35] "growthRate_WSC_15C"       "clutchSize"              
## [37] "fecundity"                "eggWeight"               
## [39] "eggDiameter"              "developmentDuration"     
## [41] "respirationRate_WSDW_15C" "respirationRate_WSWW_15C"
## [43] "excretionRateN_15C"       "excretionRateN_WSDW_15C" 
## [45] "excretionRateP_15C"       "excretionRateP_WSDW_15C" 
## [47] "bodyLengthMax"            "feedingMode"             
## [49] "reproductionMode"         "trophicGroup"            
## [51] "verticalDistribution"     "dielVerticalMigration"   
## [53] "hibernationStage"         "myelination"             
## [55] "bioluminescence"          "habitatAssociation"      
## [57] "TG.omnivore"              "TG.herbivore"            
## [59] "TG.carnivore"             "TG.detritivore"          
## [61] "FM.passive"               "FM.active.ambush"        
## [63] "FM.ambush"                "FM.active"               
## [65] "FM.current"               "FM.passive.ambush"       
## [67] "FM.cruise"                "FM.particle.feeder"      
## [69] "FM.parasite"              "FM.commensal"            
## [71] "RM.broadcasting"          "RM.brooding"             
## [73] "RM.asexual"               "MYE.absent"              
## [75] "MYE.present"              "DVM.absent"              
## [77] "DVM.present"              "DVM.weak"                
## [79] "DVM.reverse"              "DVM.strong"              
## [81] "HIB.present"              "BIO.absent"              
## [83] "BIO.present"
# Because body length is a commonly reported size measurement, we can inspect how the measures of weight scale with length.
size.traits <- zooplankton.traits %>% 
  # filter species with weight measurements
  filter(traitName %in% c("carbonWeight", "dryWeight", "wetWeight")) %>% 
  # Append body length as a column for comparison
  left_join(dplyr::select(filter(zooplankton.traits, 
                                 traitName == "bodyLengthMax"),
                          taxonID, bodyLength = traitValue),
            by = "taxonID") %>% 
  # Filter out instances when there are no length information.
  filter(!is.na(bodyLength)) %>% 
  # Make sure to convert continuous traits to numeric
  mutate(traitValue = as.numeric(traitValue),
         bodyLength = as.numeric(bodyLength))
  
# It is clear that weight scales with length, but there seems to be quite some variability in this relationship.
size.plot <- ggplot(data = size.traits, 
                    aes(x = bodyLength, y = traitValue, color = traitName)) +
  geom_point() +
  # Add labels for inspecting points
  geom_point(aes(text = paste0(scientificName," (",majorgroup,")"))) +
  # Add a linear model
  geom_smooth(method = "lm") +
  xlab("Length (mm)") + ylab("Weight (mg)") +
  # Log transform axis scales
  scale_x_continuous(labels = scaleFUN, trans = "log10") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") 
## Warning in geom_point(aes(text = paste0(scientificName, " (", majorgroup, :
## Ignoring unknown aesthetics: text
ggplotly(size.plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# To further inspect this scaling, we can separate hard-bodied crustacean zooplankton from gelatinous and soft-bodied zooplankton. 
size.traits <- size.traits %>% 
  # Assign a body plan grouping variable
  mutate(bodyPlan = if_else(phylum == "Arthropoda","hard-body","soft-body"))

# Focus on carbon weight. Here we see that body plan mediates the slope of the weight - length relationship.
size.plot.2 <- ggplot(data = filter(size.traits, traitName == "carbonWeight"), 
                      aes(x = bodyLength, y = traitValue, color = bodyPlan)) +
  # Add labels for inspecting points
  geom_point(aes(text = paste0(scientificName," (",majorgroup,")"))) +
  # Add a linear model
  geom_smooth(method = "lm") +
  xlab("Length (mm)") + ylab("Weight (mg)") +
  # Log transform axis scales
  scale_x_continuous(labels = scaleFUN, trans = "log10") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") 
## Warning in geom_point(aes(text = paste0(scientificName, " (", majorgroup, :
## Ignoring unknown aesthetics: text
ggplotly(size.plot.2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# Allometric scaling of physiological rates with size
# A common application of size as a master trait is using it as a predictor for other traits, especially physiological rates. Here, we inspect respiration rate, which is the most common rate trait in the dataset. Carbon weight will be used as the measure for size.
rate.size <- zooplankton.traits %>% 
  # select respiration rate traits
  filter(traitName %in% c("respirationRate_15C", "respirationRate_WSC_15C")) %>%
  # Append carbon weight as a column for comparison
  left_join(dplyr::select(filter(zooplankton.traits, 
                                 traitName == "carbonWeight"),
                          taxonID, carbonWeight = traitValue),
            by = "taxonID") %>% 
  # Filter out instances when there are no carbon weight information.
  filter(!is.na(carbonWeight)) %>% 
  # Make sure to convert continuous traits to numeric
  mutate(traitValue = as.numeric(traitValue),
         carbonWeight = as.numeric(carbonWeight))

# Here we see carbon weight scales with size. Weight-specific rate values often have flatter slope, and sometimes negative slope. 
rate.size.plot <- ggplot(data = rate.size, 
                      aes(x = carbonWeight, y = traitValue, color = traitName)) +
  # Add labels for inspecting points
  geom_point(aes(text = paste0(scientificName," (",majorgroup,")"))) +
  # Add a linear model
  geom_smooth(method = "lm") +
  # xlab("Length (mm)") + ylab("Weight (mg)") +
  # Log transform axis scales
  scale_x_continuous(labels = scaleFUN, trans = "log10") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") 
## Warning in geom_point(aes(text = paste0(scientificName, " (", majorgroup, :
## Ignoring unknown aesthetics: text
ggplotly(rate.size.plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# Now, can inspect the size scaling of the excretion rate of N as ammonia. The data source for excretion rates used dry weight as the associated size measure, so we reference excretion rate relative to dry weight.
rate.size.2 <- zooplankton.traits %>% 
  # select respiration rate traits
  filter(traitName %in% c("excretionRateN_15C", "excretionRateN_WSDW_15C")) %>%
  # Append dry weight as a column for comparison
  left_join(dplyr::select(filter(zooplankton.traits, 
                                 traitName == "dryWeight"),
                          taxonID, dryWeight = traitValue),
            by = "taxonID") %>% 
  # Filter out instances when there are no dry weight information.
  filter(!is.na(dryWeight)) %>% 
  # Make sure to convert continuous traits to numeric
  mutate(traitValue = as.numeric(traitValue),
         dryWeight = as.numeric(dryWeight))


rate.size.plot.2 <- ggplot(data = rate.size.2, 
                      aes(x = dryWeight, y = traitValue, color = traitName)) +
  # Add labels for inspecting points
  geom_point(aes(text = paste0(scientificName," (",majorgroup,")"))) +
  # Add a linear model
  geom_smooth(method = "lm") +
  # xlab("Length (mm)") + ylab("Weight (mg)") +
  # Log transform axis scales
  scale_x_continuous(labels = scaleFUN, trans = "log10") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") 
## Warning in geom_point(aes(text = paste0(scientificName, " (", majorgroup, :
## Ignoring unknown aesthetics: text
ggplotly(rate.size.plot.2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Relationship between traits

We can further look into how a trait might affect the distribution of other traits or the allometric scaling relationship of traits.

rate.size <- zooplankton.traits %>% 
  # select respiration rate traits
  filter(traitName %in% c("respirationRate_15C", "respirationRate_WSC_15C")) %>%
  # Append carbon weight as a column for comparison
  left_join(dplyr::select(filter(zooplankton.traits, 
                                 traitName == "carbonWeight"),
                          taxonID, carbonWeight = traitValue),
            by = "taxonID") %>% 
  # Filter out instances when there are no carbon weight information.
  filter(!is.na(carbonWeight)) %>% 
  # Make sure to convert continuous traits to numeric
  mutate(traitValue = as.numeric(traitValue),
         carbonWeight = as.numeric(carbonWeight))

# One broad way to classify zooplankton is whether they have a passive or active feeding mode. Passive feeding include ambush feeding behavior while active feeding includes raptorial cruise feeding and current or filter-feeding.
fm.activity <- zooplankton.traits %>% 
  filter(traitName %in% c("FM.passive","FM.active"),
         traitValue == 1) %>% 
  # transform for merging with the size data
  select(taxonID, feeding.activity = traitName) %>% 
  # Check if there are multiple records for the same taxon. This may not be a concern because some species employ multiple behavioral strategies. Note that this may result in duplicated rows when doing a left join. 
  group_by(taxonID) %>% 
  mutate(nvals = n()) %>% 
  arrange(-nvals)

# We know that respiration rate scales with size, but does feeding activity mediate this relationship? To explore this question, we include a feeding activity column to the rate-size figure. Here we see that active feeders have a slightly higher respiration rate compared to passive feeders. But is this a significant difference? Could the taxonomic coverage of the species included in this analysis limit what we can infer? Would it be more informative if the individual-level trait data was used?

rate.size.activity <- rate.size %>% 
  left_join(fm.activity, by = "taxonID") %>% 
  relocate(feeding.activity) %>% 
  filter(!is.na(feeding.activity))

rate.activity.plot <- ggplot(data = filter(rate.size.activity, 
                                           traitName == "respirationRate_15C"),
                             aes(x = carbonWeight, y = traitValue, color = feeding.activity)) +
  # Add labels for inspecting points
  geom_point(aes(text = paste0(scientificName," (",majorgroup,")"))) +
  # Add a linear model
  geom_smooth(method = "lm") +
  ylab("Respiration rate") +
  # Log transform axis scales
  scale_x_continuous(labels = scaleFUN, trans = "log10") +
  scale_y_continuous(labels = scaleFUN, trans = "log10") 
## Warning in geom_point(aes(text = paste0(scientificName, " (", majorgroup, :
## Ignoring unknown aesthetics: text
ggplotly(rate.activity.plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?